library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.3 v purrr 0.3.4
v tibble 3.1.2 v dplyr 1.0.6
v tidyr 1.1.3 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.1
-- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(skimr)
library(tidymodels)
Registered S3 method overwritten by 'tune':
method from
required_pkgs.model_spec parsnip
-- Attaching packages --------------------------------------------------------------------------- tidymodels 0.1.3 --
v broom 0.7.6 v rsample 0.1.0
v dials 0.0.9 v tune 0.1.5
v infer 0.5.4 v workflows 0.2.2
v modeldata 0.1.0 v workflowsets 0.0.2
v parsnip 0.1.6 v yardstick 0.0.8
v recipes 0.1.16
-- Conflicts ------------------------------------------------------------------------------ tidymodels_conflicts() --
x recipes::check() masks devtools::check()
x scales::discard() masks purrr::discard()
x dplyr::filter() masks stats::filter()
x recipes::fixed() masks stringr::fixed()
x dplyr::lag() masks stats::lag()
x yardstick::spec() masks readr::spec()
x recipes::step() masks stats::step()
* Use tidymodels_prefer() to resolve common conflicts.
library(corrplot)
corrplot 0.89 loaded
https://jhudatascience.org/tidyversecourse/model.html#case-studies-4
There are 48 predictors with values for 876 monitors (observations). The data comes from the US Environmental Protection Agency (EPA), the National Aeronautics and Space Administration (NASA), the US Census, and the National Center for Health Statistics (NCHS).
https://jhudatascience.org/tidyversecourse/model.html#data-import
# Import the PM2.5 monitor data; readr infers doubles by default,
# with state/county/city parsed as character columns.
pm <- readr::read_csv("data/pm25_data.csv")
-- Column specification ---------------------------------------------------------------------------------------------
cols(
.default = col_double(),
state = col_character(),
county = col_character(),
city = col_character()
)
i Use `spec()` for the full column specifications.
# Quick look at the column types and first values.
pm %>% glimpse()
Rows: 876
Columns: 50
$ id <dbl> 1003.001, 1027.000, 1033.100, 1049.100, 1055.001, 1069.000, 1073.002, 1073.101,~
$ value <dbl> 9.597647, 10.800000, 11.212174, 11.659091, 12.375394, 10.508850, 15.591017, 12.~
$ fips <dbl> 1003, 1027, 1033, 1049, 1055, 1069, 1073, 1073, 1073, 1073, 1073, 1073, 1073, 1~
$ lat <dbl> 30.49800, 33.28126, 34.75878, 34.28763, 33.99375, 31.22636, 33.55306, 33.33111,~
$ lon <dbl> -87.88141, -85.80218, -87.65056, -85.96830, -85.99107, -85.39077, -86.81500, -8~
$ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "A~
$ county <chr> "Baldwin", "Clay", "Colbert", "DeKalb", "Etowah", "Houston", "Jefferson", "Jeff~
$ city <chr> "Fairhope", "Ashland", "Muscle Shoals", "Crossville", "Gadsden", "Dothan", "Bir~
$ CMAQ <dbl> 8.098836, 9.766208, 9.402679, 8.534772, 9.241744, 9.121892, 10.235612, 10.23561~
$ zcta <dbl> 36532, 36251, 35660, 35962, 35901, 36303, 35207, 35111, 35444, 35094, 35064, 35~
$ zcta_area <dbl> 190980522, 374132430, 16716984, 203836235, 154069359, 162685124, 26929603, 1662~
$ zcta_pop <dbl> 27829, 5103, 9042, 8300, 20045, 30217, 9010, 16140, 3699, 14212, 11458, 32390, ~
$ imp_a500 <dbl> 0.01730104, 1.96972318, 19.17301038, 5.78200692, 16.49307958, 19.13927336, 41.8~
$ imp_a1000 <dbl> 1.40960208, 0.85315744, 11.14489619, 3.86764706, 11.09645329, 16.67149654, 41.0~
$ imp_a5000 <dbl> 3.33601181, 0.98514787, 15.17861538, 1.23114143, 14.67778782, 16.38548211, 36.0~
$ imp_a10000 <dbl> 1.9879187, 0.5208189, 9.7253870, 1.0316469, 8.2200785, 8.0739624, 23.1790413, 4~
$ imp_a15000 <dbl> 1.4386207, 0.3359198, 5.2472094, 0.9730444, 5.1612102, 4.7401296, 17.4524844, 4~
$ county_area <dbl> 4117521611, 1564252280, 1534877333, 2012662359, 1385618994, 1501737720, 2878192~
$ county_pop <dbl> 182265, 13932, 54428, 71109, 104430, 101547, 658466, 658466, 194656, 658466, 65~
$ log_dist_to_prisec <dbl> 4.648181, 7.219907, 5.760131, 3.721489, 5.261457, 7.112373, 6.600958, 6.526896,~
$ log_pri_length_5000 <dbl> 8.517193, 8.517193, 8.517193, 8.517193, 9.066563, 8.517193, 11.156977, 10.91506~
$ log_pri_length_10000 <dbl> 9.210340, 9.210340, 9.274303, 10.409411, 11.137360, 9.210340, 11.849658, 11.728~
$ log_pri_length_15000 <dbl> 9.630228, 9.615805, 9.658899, 11.173626, 11.587258, 9.615805, 12.401245, 12.230~
$ log_pri_length_25000 <dbl> 11.32735, 10.12663, 10.15769, 11.90959, 12.01356, 10.12663, 12.98762, 12.86472,~
$ log_prisec_length_500 <dbl> 7.295356, 6.214608, 8.611945, 7.310155, 8.740680, 6.214608, 6.214608, 6.214608,~
$ log_prisec_length_1000 <dbl> 8.195119, 7.600902, 9.735569, 8.585843, 9.627898, 7.600902, 9.075921, 8.949239,~
$ log_prisec_length_5000 <dbl> 10.815042, 10.170878, 11.770407, 10.214200, 11.728889, 12.298627, 12.281645, 11~
$ log_prisec_length_10000 <dbl> 11.886803, 11.405543, 12.840663, 11.508944, 12.768279, 12.994141, 13.278416, 12~
$ log_prisec_length_15000 <dbl> 12.205723, 12.042963, 13.282656, 12.353663, 13.189489, 13.326132, 13.826365, 12~
$ log_prisec_length_25000 <dbl> 13.41395, 12.79980, 13.79973, 13.55979, 13.70026, 13.85550, 14.45221, 13.80017,~
$ log_nei_2008_pm25_sum_10000 <dbl> 0.318035438, 3.218632928, 6.573127301, 0.000000000, 3.772775514, 0.909136688, 7~
$ log_nei_2008_pm25_sum_15000 <dbl> 1.967358961, 3.218632928, 6.581917457, 3.282789151, 3.785646441, 3.174609213, 8~
$ log_nei_2008_pm25_sum_25000 <dbl> 5.067308, 3.218633, 6.875900, 4.887665, 4.160506, 3.190008, 8.651068, 7.806773,~
$ log_nei_2008_pm10_sum_10000 <dbl> 1.35588511, 3.31111648, 6.69187313, 0.00000000, 4.43719884, 0.92888890, 8.20975~
$ log_nei_2008_pm10_sum_15000 <dbl> 2.26783411, 3.31111648, 6.70127741, 3.35004443, 4.46267932, 3.67473904, 8.64883~
$ log_nei_2008_pm10_sum_25000 <dbl> 5.628728, 3.311116, 7.148858, 5.171920, 4.678311, 3.744629, 8.858019, 8.021072,~
$ popdens_county <dbl> 44.265706, 8.906492, 35.460814, 35.330814, 75.367038, 67.619664, 228.777633, 22~
$ popdens_zcta <dbl> 145.7164307, 13.6395554, 540.8870404, 40.7189625, 130.1037411, 185.7391706, 334~
$ nohs <dbl> 3.3, 11.6, 7.3, 14.3, 4.3, 5.8, 7.1, 2.7, 11.1, 7.2, 2.7, 0.8, 2.8, 9.7, 1.2, 3~
$ somehs <dbl> 4.9, 19.1, 15.8, 16.7, 13.3, 11.6, 17.1, 6.6, 11.6, 12.2, 9.8, 2.6, 8.2, 21.6, ~
$ hs <dbl> 25.1, 33.9, 30.6, 35.0, 27.8, 29.8, 37.2, 30.7, 46.0, 32.2, 28.8, 12.9, 32.0, 3~
$ somecollege <dbl> 19.7, 18.8, 20.9, 14.9, 29.2, 21.4, 23.5, 25.7, 17.2, 19.0, 26.3, 17.9, 28.9, 2~
$ associate <dbl> 8.2, 8.0, 7.6, 5.5, 10.1, 7.9, 7.3, 8.0, 4.1, 6.8, 12.4, 5.2, 7.4, 5.2, 6.5, 6.~
$ bachelor <dbl> 25.3, 5.5, 12.7, 7.9, 10.0, 13.7, 5.9, 17.6, 7.1, 14.8, 11.6, 35.5, 13.5, 2.2, ~
$ grad <dbl> 13.5, 3.1, 5.1, 5.8, 5.4, 9.8, 2.0, 8.7, 2.9, 7.7, 8.3, 25.2, 7.1, 0.4, 23.3, 4~
$ pov <dbl> 6.1, 19.5, 19.0, 13.8, 8.8, 15.6, 25.5, 7.3, 8.1, 10.5, 18.1, 2.1, 5.1, 13.3, 5~
$ hs_orless <dbl> 33.3, 64.6, 53.7, 66.0, 45.4, 47.2, 61.4, 40.0, 68.7, 51.6, 41.3, 16.3, 43.0, 7~
$ urc2013 <dbl> 4, 6, 4, 6, 4, 4, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 4, 3, 1, 5, 4, 2, 6, 4, 4~
$ urc2006 <dbl> 5, 6, 4, 5, 4, 4, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 4, 3, 1, 5, 4, 2, 6, 5, 4~
$ aod <dbl> 37.363636, 34.818182, 36.000000, 33.083333, 43.416667, 33.000000, 39.583333, 38~
# The monitor, county, and ZIP-code identifiers are codes, not
# quantities — treat them as factors rather than numbers.
pm <- pm %>%
  mutate(across(c(id, fips, zcta), as_factor))
# Summary statistics for every variable, grouped by column type.
pm %>% skim()
-- Data Summary ------------------------
Values
Name pm
Number of rows 876
Number of columns 50
_______________________
Column type frequency:
character 3
factor 3
numeric 44
________________________
Group variables None
-- Variable type: character -----------------------------------------------------------------------------------------
# A tibble: 3 x 8
skim_variable n_missing complete_rate min max empty n_unique whitespace
* <chr> <int> <dbl> <int> <int> <int> <int> <int>
1 state 0 1 4 20 0 49 0
2 county 0 1 3 20 0 471 0
3 city 0 1 4 48 0 607 0
-- Variable type: factor --------------------------------------------------------------------------------------------
# A tibble: 3 x 6
skim_variable n_missing complete_rate ordered n_unique top_counts
* <chr> <int> <dbl> <lgl> <int> <chr>
1 id 0 1 FALSE 876 100: 1, 102: 1, 103: 1, 104: 1
2 fips 0 1 FALSE 569 170: 12, 603: 10, 261: 9, 107: 8
3 zcta 0 1 FALSE 842 475: 3, 110: 2, 160: 2, 290: 2
-- Variable type: numeric -------------------------------------------------------------------------------------------
# A tibble: 44 x 11
skim_variable n_missing complete_rate mean sd p0 p25
* <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 value 0 1 10.8 2.58e+0 3.02 9.27
2 lat 0 1 38.5 4.62e+0 25.5 35.0
3 lon 0 1 -91.7 1.50e+1 -124. -99.2
4 CMAQ 0 1 8.41 2.97e+0 1.63 6.53
5 zcta_area 0 1 183173482. 5.43e+8 15459 14204602.
6 zcta_pop 0 1 24228. 1.78e+4 0 9797
7 imp_a500 0 1 24.7 1.93e+1 0 3.70
8 imp_a1000 0 1 24.3 1.80e+1 0 5.32
9 imp_a5000 0 1 19.9 1.47e+1 0.0534 6.79
10 imp_a10000 0 1 15.8 1.38e+1 0.0942 4.54
11 imp_a15000 0 1 13.4 1.31e+1 0.108 3.24
12 county_area 0 1 3768701992. 6.21e+9 33703512 1116536298.
13 county_pop 0 1 687298. 1.29e+6 783 100948
14 log_dist_to_prisec 0 1 6.19 1.41e+0 -1.46 5.43
15 log_pri_length_5000 0 1 9.82 1.08e+0 8.52 8.52
16 log_pri_length_10000 0 1 10.9 1.13e+0 9.21 9.80
17 log_pri_length_15000 0 1 11.5 1.15e+0 9.62 10.9
18 log_pri_length_25000 0 1 12.2 1.10e+0 10.1 11.7
19 log_prisec_length_500 0 1 6.99 9.53e-1 6.21 6.21
20 log_prisec_length_1000 0 1 8.56 7.90e-1 7.60 7.60
21 log_prisec_length_5000 0 1 11.3 7.81e-1 8.52 10.9
22 log_prisec_length_10000 0 1 12.4 7.33e-1 9.21 12.0
23 log_prisec_length_15000 0 1 13.0 7.23e-1 9.62 12.6
24 log_prisec_length_25000 0 1 13.8 6.99e-1 10.1 13.4
25 log_nei_2008_pm25_sum_10000 0 1 3.97 2.35e+0 0 2.15
26 log_nei_2008_pm25_sum_15000 0 1 4.72 2.25e+0 0 3.47
27 log_nei_2008_pm25_sum_25000 0 1 5.67 2.11e+0 0 4.66
28 log_nei_2008_pm10_sum_10000 0 1 4.35 2.32e+0 0 2.69
29 log_nei_2008_pm10_sum_15000 0 1 5.10 2.18e+0 0 3.87
30 log_nei_2008_pm10_sum_25000 0 1 6.07 2.01e+0 0 5.10
31 popdens_county 0 1 552. 1.71e+3 0.263 40.8
32 popdens_zcta 0 1 1280. 2.76e+3 0 101.
33 nohs 0 1 6.99 7.21e+0 0 2.7
34 somehs 0 1 10.2 6.20e+0 0 5.9
35 hs 0 1 30.3 1.14e+1 0 23.8
36 somecollege 0 1 21.6 8.60e+0 0 17.5
37 associate 0 1 7.13 4.01e+0 0 4.9
38 bachelor 0 1 14.9 9.71e+0 0 8.8
39 grad 0 1 8.91 8.65e+0 0 3.9
40 pov 0 1 15.0 1.13e+1 0 6.5
41 hs_orless 0 1 47.5 1.68e+1 0 37.9
42 urc2013 0 1 2.92 1.52e+0 1 2
43 urc2006 0 1 2.97 1.52e+0 1 2
44 aod 0 1 43.7 1.96e+1 5 31.7
p50 p75 p100 hist
* <dbl> <dbl> <dbl> <chr>
1 11.2 12.4 2.32e 1 ▂▆▇▁▁
2 39.3 41.7 4.84e 1 ▁▃▅▇▂
3 -87.5 -80.7 -6.80e 1 ▃▂▃▇▃
4 8.62 10.2 2.31e 1 ▃▇▃▁▁
5 37653560. 160041508. 8.16e 9 ▇▁▁▁▁
6 22014 35005. 9.54e 4 ▇▇▃▁▁
7 25.1 40.2 6.96e 1 ▇▅▆▃▂
8 24.5 38.6 6.75e 1 ▇▅▆▃▁
9 19.1 30.1 7.46e 1 ▇▆▃▁▁
10 12.4 24.2 7.21e 1 ▇▃▂▁▁
11 9.67 20.6 7.11e 1 ▇▃▁▁▁
12 1690826566. 2878192209 5.19e10 ▇▁▁▁▁
13 280730. 743159 9.82e 6 ▇▁▁▁▁
14 6.36 7.15 1.05e 1 ▁▁▃▇▁
15 10.1 10.7 1.20e 1 ▇▂▆▅▂
16 11.2 11.8 1.30e 1 ▇▂▇▇▃
17 11.7 12.4 1.36e 1 ▆▂▇▇▃
18 12.5 13.1 1.44e 1 ▅▃▇▇▃
19 6.21 7.82 9.40e 0 ▇▁▂▂▁
20 8.66 9.20 1.05e 1 ▇▅▆▃▁
21 11.4 11.8 1.28e 1 ▁▁▃▇▃
22 12.5 12.9 1.38e 1 ▁▁▃▇▅
23 13.1 13.6 1.44e 1 ▁▁▃▇▅
24 13.9 14.4 1.52e 1 ▁▁▃▇▆
25 4.29 5.69 9.12e 0 ▆▅▇▆▂
26 5.00 6.35 9.42e 0 ▃▃▇▇▂
27 5.91 7.28 9.65e 0 ▂▂▇▇▃
28 4.62 6.07 9.34e 0 ▅▅▇▇▂
29 5.39 6.72 9.71e 0 ▂▃▇▇▂
30 6.37 7.52 9.88e 0 ▁▂▆▇▃
31 157. 511. 2.68e 4 ▇▁▁▁▁
32 610. 1383. 3.04e 4 ▇▁▁▁▁
33 5.1 8.8 1 e 2 ▇▁▁▁▁
34 9.4 13.9 7.22e 1 ▇▂▁▁▁
35 30.8 36.1 1 e 2 ▂▇▂▁▁
36 21.3 24.7 1 e 2 ▆▇▁▁▁
37 7.1 8.8 7.14e 1 ▇▁▁▁▁
38 13.0 19.2 1 e 2 ▇▂▁▁▁
39 6.7 11 1 e 2 ▇▁▁▁▁
40 12.1 21.2 6.59e 1 ▇▅▂▁▁
41 48.7 59.1 1 e 2 ▁▃▇▃▁
42 3 4 6 e 0 ▇▅▃▂▁
43 3 4 6 e 0 ▇▅▃▂▁
44 40.2 49.7 1.43e 2 ▃▇▁▁▁
# Correlation matrix of the numeric variables only.
# select_if() is superseded in dplyr; select(where(is.numeric)) is
# the current idiom and behaves identically here.
PM_cor <- cor(pm %>% dplyr::select(where(is.numeric)))
# Plot the matrix; shrink labels so all 44 variable names fit.
corrplot::corrplot(PM_cor, tl.cex = 0.5)
tidymodels
# Fix the RNG seed so the train/test partition is reproducible.
set.seed(1234)
# Reserve 2/3 of the monitors for training; the rest for testing.
pm_split <- rsample::initial_split(data = pm, prop = 2 / 3)
pm_split
<Analysis/Assess/Total>
<584/292/876>
Importantly, the initial_split() function only determines which rows of our pm data frame should be assigned for training or testing; it does not actually split the data.
To extract the testing and training data we can use the training() and testing() functions also of the rsample package.
# Materialize the two subsets from the split object.
train_pm <- rsample::training(pm_split)
test_pm <- rsample::testing(pm_split)
https://recipes.tidymodels.org/reference/index.html
# library(tidymodels)
# Start a recipe: `value` is the outcome, every other column a predictor.
simple_rec <- train_pm %>%
  recipes::recipe(value ~ .)
simple_rec
Data Recipe
Inputs:
of id which is not a predictor
# Rebuild the recipe, demoting the monitor `id` from predictor to a
# plain identifier so it is carried along but not modeled.
simple_rec <- train_pm %>%
  recipes::recipe(value ~ .) %>%
  recipes::update_role(id, new_role = "id variable")
simple_rec
Data Recipe
Inputs:
https://recipes.tidymodels.org/reference/index.html
All of the step functions look like step_*(), with the * replaced with a name, except for the check functions, which look like check_*().
There are several ways to select what variables to apply steps to:
We want to dummy encode our categorical variables so that they are numeric as we plan to use a linear regression for our model.
We will use one-hot encoding. This means that we do not simply encode our categorical variables numerically, as numeric assignments can be interpreted by algorithms as having a particular rank or order. Instead, binary variables made of 1s and 0s are used to arbitrarily assign a numeric value that has no apparent order.
# Preview: one-hot encode the categorical variables (the recipe is
# only displayed here, not yet prepped).
simple_rec %>%
  step_dummy(state, county, city, zcta, one_hot = TRUE)
Data Recipe
Inputs:
Operations:
Dummy variables from state, county, city, zcta
# Preview: give the county FIPS code an identifier role so it is
# excluded from the predictors.
simple_rec %>%
  update_role("fips", new_role = "county id")
Data Recipe
Inputs:
We also want to remove variables that appear to be redundant and are highly correlated with others, as we know from our exploratory data analysis that many of our variables are correlated with one another. We can do this using the step_corr() function.
# Preview: drop predictors that are highly correlated with others,
# but always keep CMAQ and aod.
simple_rec %>%
  step_corr(all_predictors(), -CMAQ, -aod)
Data Recipe
Inputs:
Operations:
Correlation filter on all_predictors(), -CMAQ, -aod
# Preview: remove near-zero-variance predictors, again always
# keeping CMAQ and aod.
simple_rec %>%
  step_nzv(all_predictors(), -CMAQ, -aod)
Data Recipe
Inputs:
Operations:
Sparse, unbalanced variable filter on all_predictors(), -CMAQ, -aod
# Full preprocessing recipe: id roles, one-hot dummies, then filter
# out highly correlated and near-zero-variance predictors (CMAQ and
# aod are always retained).
simple_rec <- train_pm %>%
  recipes::recipe(value ~ .) %>%
  recipes::update_role(id, new_role = "id variable") %>%
  update_role("fips", new_role = "county id") %>%
  step_dummy(state, county, city, zcta, one_hot = TRUE) %>%
  step_corr(all_predictors(), -CMAQ, -aod) %>%
  step_nzv(all_predictors(), -CMAQ, -aod)
simple_rec
Data Recipe
Inputs:
Operations:
Dummy variables from state, county, city, zcta
Correlation filter on all_predictors(), -CMAQ, -aod
Sparse, unbalanced variable filter on all_predictors(), -CMAQ, -aod
# Estimate the preprocessing steps on the training data.
# retain = TRUE keeps the processed training set in the object so
# bake(new_data = NULL) can return it without recomputing.
prepped_rec <- prep(simple_rec, retain = TRUE, verbose = TRUE)
oper 1 step dummy [training]
oper 2 step corr [training]
Warning in cor(x, use = use, method = method) :
the standard deviation is zero
Warning: The correlation matrix has missing values. 273 columns were excluded from the filter.
oper 3 step nzv [training]
The retained training set is ~ 0.26 Mb in memory.
Since we retained our preprocessed training data (i.e. prep(retain = TRUE)), we can take a look at it by using the bake() function of the recipes package.
This requires the new_data = NULL argument when we are using the training data.
# Pull out the retained, preprocessed training data and inspect it.
preproc_train <- bake(prepped_rec, new_data = NULL)
preproc_train %>% glimpse()
Rows: 584
Columns: 37
$ id <fct> 18003.0004, 55041.0007, 6065.1003, 39009.0003, 39061.8001, 24510.0006, 6061.000~
$ fips <fct> 18003, 55041, 6065, 39009, 39061, 24510, 6061, 6065, 44003, 37111, 19113, 6031,~
$ lat <dbl> 41.09497, 45.56300, 33.94603, 39.44217, 39.18053, 39.34056, 38.74573, 33.85275,~
$ lon <dbl> -85.10182, -88.80880, -117.40063, -81.90883, -84.49215, -76.58222, -121.26631, ~
$ CMAQ <dbl> 10.383231, 3.411247, 11.404085, 7.971165, 11.791236, 10.940985, 8.814368, 7.719~
$ zcta_area <dbl> 16696709, 370280916, 41957182, 132383592, 5358625, 461424, 28131203, 94913381, ~
$ zcta_pop <dbl> 21306, 4141, 44001, 1115, 6566, 934, 41192, 26179, 6135, 30600, 40149, 26079, 3~
$ imp_a500 <dbl> 28.97837370, 0.00000000, 30.39013841, 0.00000000, 51.26198934, 23.86332180, 31.~
$ imp_a15000 <dbl> 13.0547959, 0.3676404, 23.7457506, 0.3307975, 21.7174564, 24.0009267, 17.909259~
$ county_area <dbl> 1702419942, 2626421270, 18664696661, 1304313482, 1051302780, 209643241, 3644136~
$ county_pop <dbl> 355329, 9304, 2189641, 64757, 802374, 620961, 348432, 2189641, 166158, 44996, 2~
$ log_dist_to_prisec <dbl> 6.621891, 8.415468, 7.419762, 6.344681, 5.778350, 6.290041, 7.018836, 6.838720,~
$ log_pri_length_5000 <dbl> 8.517193, 8.517193, 10.150514, 8.517193, 9.956985, 10.300814, 10.288845, 9.4423~
$ log_pri_length_25000 <dbl> 12.77378, 10.16440, 13.14450, 10.12663, 13.02123, 13.68658, 12.19710, 11.33494,~
$ log_prisec_length_500 <dbl> 6.214608, 6.214608, 6.214608, 6.214608, 7.141268, 6.214608, 6.214608, 6.214608,~
$ log_prisec_length_1000 <dbl> 9.240294, 7.600902, 7.600902, 8.793450, 8.407702, 9.451706, 7.600902, 8.094200,~
$ log_prisec_length_5000 <dbl> 11.485093, 9.425537, 10.155961, 10.562382, 11.698608, 12.086627, 10.288845, 10.~
$ log_prisec_length_10000 <dbl> 12.75582, 11.44833, 11.59563, 11.69093, 12.95940, 13.60081, 11.04273, 11.16916,~
$ log_nei_2008_pm10_sum_10000 <dbl> 4.91110140, 3.86982666, 4.03184660, 0.00000000, 5.89930186, 5.67208039, 4.20670~
$ log_nei_2008_pm10_sum_15000 <dbl> 5.399131255, 3.883688842, 5.459257126, 0.000000000, 6.294167687, 6.365952312, 4~
$ log_nei_2008_pm10_sum_25000 <dbl> 5.8160473, 3.8872637, 6.8845369, 3.7656347, 6.7347561, 8.6395685, 5.7761975, 3.~
$ popdens_county <dbl> 208.719947, 3.542463, 117.314577, 49.648341, 763.218756, 2961.989125, 95.614433~
$ popdens_zcta <dbl> 1276.059851, 11.183401, 1048.711994, 8.422494, 1225.314330, 2024.168660, 1464.2~
$ nohs <dbl> 4.3, 5.1, 3.7, 4.8, 2.1, 0.0, 2.5, 7.7, 0.8, 8.6, 1.4, 19.2, 8.9, 6.4, 8.3, 2.2~
$ somehs <dbl> 6.7, 10.4, 5.9, 11.5, 10.5, 0.0, 4.3, 7.5, 4.0, 13.9, 4.8, 25.4, 26.4, 9.5, 11.~
$ hs <dbl> 31.7, 40.3, 17.9, 47.3, 30.0, 0.0, 17.8, 21.7, 33.4, 33.4, 25.3, 26.5, 31.6, 26~
$ somecollege <dbl> 27.2, 24.1, 26.3, 20.0, 27.1, 0.0, 26.1, 26.0, 19.2, 18.1, 22.4, 20.0, 28.4, 26~
$ associate <dbl> 8.2, 7.4, 8.3, 3.1, 8.5, 71.4, 13.2, 7.6, 11.8, 11.0, 10.3, 5.4, 3.6, 4.3, 9.7,~
$ bachelor <dbl> 15.0, 8.6, 20.2, 9.8, 14.2, 0.0, 23.4, 17.2, 20.9, 10.2, 25.2, 2.3, 1.1, 19.1, ~
$ grad <dbl> 6.8, 4.2, 17.7, 3.5, 7.6, 28.6, 12.6, 12.3, 9.9, 4.8, 10.5, 1.1, 0.0, 8.4, 5.9,~
$ pov <dbl> 13.500, 18.900, 6.700, 14.400, 12.500, 3.525, 5.900, 10.300, 5.300, 14.000, 5.8~
$ hs_orless <dbl> 42.7, 55.8, 27.5, 63.6, 42.6, 0.0, 24.6, 36.9, 38.2, 55.9, 31.5, 71.1, 66.9, 42~
$ urc2006 <dbl> 3, 6, 1, 5, 1, 1, 2, 1, 2, 6, 4, 4, 4, 4, 4, 2, 5, 3, 5, 4, 3, 5, 5, 1, 1, 1, 3~
$ aod <dbl> 54.11111, 31.16667, 83.12500, 33.36364, 50.80000, 53.50000, 65.09091, 18.14286,~
$ value <dbl> 11.699065, 6.956780, 13.289744, 10.742000, 14.454783, 12.204167, 11.192063, 6.9~
$ state_California <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ city_Not.in.a.city <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
# Apply the recipe (trained on the training set) to the held-out test set.
baked_test_pm <- prepped_rec %>%
  recipes::bake(new_data = test_pm)
Warning: There are new levels in a factor: Colbert, Etowah, Russell, Walker, Cochise, Coconino, Mohave, Arkansas, Crittenden, Garland, Phillips, Pope, Calaveras, Humboldt, Inyo, Merced, Monterey, San Benito, Sonoma, Tulare, Arapahoe, Elbert, Larimer, Mesa, Brevard, Leon, Sarasota, Paulding, Ada, Benewah, Lemhi, Shoshone, DuPage, McHenry, Winnebago, Elkhart, St. Joseph, Tippecanoe, Vanderburgh, Black Hawk, Pottawattamie, Van Buren, Wright, Boyd, Christian, Daviess, Hardin, Kenton, Pike, Calcasieu, St. Bernard, Tangipahoa, Baltimore, Cecil, Bristol, Berrien, Missaukee, Muskegon, Oakland, Dakota, Olmsted, Grenada, Lauderdale, Cascade, Sanders, Silver Bow, Scotts Bluff, Merrimack, Rockingham, Atlantic, Gloucester, Lea, San Juan, Santa Fe, Onondaga, Steuben, Westchester, Martin, Pitt, Swain, Watauga, Billings, Lorain, Muskogee, Sequoyah, Deschutes, Multnomah, Umatilla, Berks, Bucks, Cambria, Centre, Lackawanna, Northampton, York, Edgefield, Florence, Pennington, Bowie, Ector, Cache, Frederick, Brist [... truncated]
Warning: There are new levels in a factor: Muscle Shoals, Gadsden, Dothan, Chickasaw, Montgomery, Phenix City, Douglas, Flagstaff, Peach Springs, Apache Junction, Stuttgart, Hot Springs (Hot Springs National Park), Newport, Russellville, Springdale, San Andreas, Eureka, El Centro, Keeler, Lakeport, Azusa, Burbank, Merced, Salinas, Grass Valley, Anaheim, Quincy, Hollister, Ontario, Victorville, San Bernardino, Chula Vista, San Diego, San Jose, Live Oak, Santa Rosa, Visalia, Piru, Simi Valley, Littleton, Fort Collins, Grand Junction, Bridgeport, Dover, Wilmington, Melbourne, Tampa, Tallahassee, Palm Springs North, Orlando, Belle Glade, Ridge Wood Heights, Savannah, Kennesaw, Doraville, Boise (corporate name Boise City), Pinehurst (Pine Creek), Champaign, Summit, Naperville, Elgin, Zion, Cary, Alton, Braidwood, Rockford, Elkhart, Anderson, South Bend, Evansville, Waterloo, Keokuk, Council Bluffs, Clarion, Frankfort, Elizabethtown, Covington, Richmond, Pikeville, Vinton, Lake Charles, Chalmette, Co [... truncated]
# Inspect the baked test data — note the NAs in the dummy column,
# caused by factor levels unseen during training.
baked_test_pm %>% glimpse()
Rows: 292
Columns: 37
$ id <fct> 1033.1002, 1055.001, 1069.0003, 1073.0023, 1073.1005, 1073.1009, 1073.5003, 109~
$ fips <fct> 1033, 1055, 1069, 1073, 1073, 1073, 1073, 1097, 1101, 1113, 1127, 4003, 4005, 4~
$ lat <dbl> 34.75878, 33.99375, 31.22636, 33.55306, 33.33111, 33.45972, 33.80167, 30.76994,~
$ lon <dbl> -87.65056, -85.99107, -85.39077, -86.81500, -87.00361, -87.30556, -86.94250, -8~
$ CMAQ <dbl> 9.402679, 9.241744, 9.121892, 10.235612, 10.235612, 8.161789, 8.816101, 8.65489~
$ zcta_area <dbl> 16716984, 154069359, 162685124, 26929603, 166239542, 385566685, 230081840, 1230~
$ zcta_pop <dbl> 9042, 20045, 30217, 9010, 16140, 3699, 13794, 6106, 12997, 8638, 10424, 1021, 6~
$ imp_a500 <dbl> 19.17301038, 16.49307958, 19.13927336, 41.83304498, 1.69765343, 0.00000000, 0.4~
$ imp_a15000 <dbl> 5.2472094, 5.1612102, 4.7401296, 17.4524844, 4.3006226, 0.1623359, 0.9481135, 7~
$ county_area <dbl> 1534877333, 1385618994, 1501737720, 2878192209, 2878192209, 3423328940, 2878192~
$ county_pop <dbl> 54428, 104430, 101547, 658466, 658466, 194656, 658466, 412992, 229363, 189885, ~
$ log_dist_to_prisec <dbl> 5.760131, 5.261457, 7.112373, 6.600958, 6.526896, 9.789557, 8.734821, 5.779170,~
$ log_pri_length_5000 <dbl> 8.517193, 9.066563, 8.517193, 11.156977, 10.915066, 8.517193, 8.517193, 10.3642~
$ log_pri_length_25000 <dbl> 10.15769, 12.01356, 10.12663, 12.98762, 12.86472, 10.12663, 11.85793, 12.23758,~
$ log_prisec_length_500 <dbl> 8.611945, 8.740680, 6.214608, 6.214608, 6.214608, 6.214608, 6.214608, 7.595229,~
$ log_prisec_length_1000 <dbl> 9.735569, 9.627898, 7.600902, 9.075921, 8.949239, 7.600902, 7.600902, 8.658654,~
$ log_prisec_length_5000 <dbl> 11.770407, 11.728889, 12.298627, 12.281645, 11.039001, 8.517193, 8.517193, 11.7~
$ log_prisec_length_10000 <dbl> 12.840663, 12.768279, 12.994141, 13.278416, 12.128956, 9.210340, 10.420770, 12.~
$ log_nei_2008_pm10_sum_10000 <dbl> 6.69187313, 4.43719884, 0.92888890, 8.20975958, 5.77118113, 0.01654963, 0.00000~
$ log_nei_2008_pm10_sum_15000 <dbl> 6.70127741, 4.46267932, 3.67473904, 8.64883455, 6.86889676, 0.01654963, 4.01669~
$ log_nei_2008_pm10_sum_25000 <dbl> 7.148858, 4.678311, 3.744629, 8.858019, 8.021072, 6.891992, 6.961580, 6.640326,~
$ popdens_county <dbl> 35.4608142, 75.3670384, 67.6196640, 228.7776327, 228.7776327, 56.8616114, 228.7~
$ popdens_zcta <dbl> 540.8870404, 130.1037411, 185.7391706, 334.5760426, 97.0888142, 9.5936712, 59.9~
$ nohs <dbl> 7.3, 4.3, 5.8, 7.1, 2.7, 11.1, 9.7, 3.0, 8.8, 8.6, 5.7, 32.7, 1.7, 23.9, 8.6, 3~
$ somehs <dbl> 15.8, 13.3, 11.6, 17.1, 6.6, 11.6, 21.6, 17.1, 19.3, 24.6, 14.3, 11.8, 0.0, 17.~
$ hs <dbl> 30.6, 27.8, 29.8, 37.2, 30.7, 46.0, 39.3, 39.0, 36.0, 27.7, 37.6, 45.0, 22.3, 2~
$ somecollege <dbl> 20.9, 29.2, 21.4, 23.5, 25.7, 17.2, 21.6, 22.4, 20.2, 19.4, 21.3, 7.9, 36.4, 20~
$ associate <dbl> 7.6, 10.1, 7.9, 7.3, 8.0, 4.1, 5.2, 6.6, 4.6, 4.3, 7.3, 2.6, 14.7, 4.3, 4.9, 10~
$ bachelor <dbl> 12.7, 10.0, 13.7, 5.9, 17.6, 7.1, 2.2, 7.8, 7.6, 9.5, 10.3, 0.0, 13.6, 4.2, 2.7~
$ grad <dbl> 5.1, 5.4, 9.8, 2.0, 8.7, 2.9, 0.4, 4.2, 3.5, 5.9, 3.5, 0.0, 11.4, 2.0, 0.6, 5.4~
$ pov <dbl> 19.0, 8.8, 15.6, 25.5, 7.3, 8.1, 13.3, 23.2, 23.0, 37.7, 19.3, 15.6, 64.3, 29.5~
$ hs_orless <dbl> 53.7, 45.4, 47.2, 61.4, 40.0, 68.7, 70.6, 59.1, 64.1, 60.9, 57.6, 89.5, 24.0, 6~
$ urc2006 <dbl> 4, 4, 4, 1, 1, 1, 2, 3, 3, 3, 2, 5, 4, 1, 5, 2, 6, 2, 4, 6, 5, 5, 3, 3, 6, 3, 5~
$ aod <dbl> 36.000000, 43.416667, 33.000000, 39.583333, 38.750000, 40.363636, 42.636364, 42~
$ value <dbl> 11.212174, 12.375394, 10.508850, 15.591017, 12.399355, 11.103704, 11.775000, 10~
$ state_California <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1~
$ city_Not.in.a.city <dbl> NA, NA, NA, 0, 1, 1, 1, NA, NA, NA, 0, NA, NA, 0, NA, NA, NA, 0, NA, NA, 0, NA,~
# Distinct city values in each split.
traincities <- train_pm %>% distinct(city)
testcities <- test_pm %>% distinct(city)
# How many cities appear in only one of the two splits?
dplyr::setdiff(traincities, testcities) %>% dim()
[1] 381 1
#get the number of cities that overlapped
dim(dplyr::intersect(traincities, testcities))
[1] 55 1
So, let’s go back to our pm dataset and modify the city variable to just be values of in a city or not in a city using the case_when() function of dplyr. This function allows you to vectorize multiple if_else() statements.
# Preview: collapse the many distinct city values into a two-level
# indicator -- "Not in a city" stays as-is, everything else becomes
# "In a city" (NA city values remain NA).
pm %>% # count(city, sort = TRUE)
  mutate(city = if_else(city == "Not in a city",
                        "Not in a city",
                        "In a city")) %>%
  select(state, county, city)

# Overwrite `city` in `pm` with the collapsed two-level version.
pm <- pm %>%
  mutate(city = if_else(city == "Not in a city",
                        "Not in a city",
                        "In a city"))
# Re-split with the same seed so the 2/3 train / 1/3 test partition matches
# the earlier split exactly.
set.seed(1234) # same seed as before
pm_split <-rsample::initial_split(data = pm, prop = 2/3)
pm_split
<Analysis/Assess/Total>
<584/292/876>
train_pm <-rsample::training(pm_split)
test_pm <-rsample::testing(pm_split)
# Recipe for the linear model:
#  - `value` (PM2.5) is the outcome; `id` and `fips` are kept only as
#    identifiers so they are not used as predictors.
#  - step_dummy() one-hot encodes the categorical variables.
#  - step_corr() drops highly correlated numeric columns; step_nzv() drops
#    near-zero-variance ones.
# NOTE(review): all_numeric() also matches the outcome `value` -- confirm the
# correlation/nzv filters are intended to be allowed to touch the outcome.
novel_rec <-train_pm %>%
recipe() %>%
update_role(everything(), new_role = "predictor") %>%
update_role(value, new_role = "outcome") %>%
update_role(id, new_role = "id variable") %>%
update_role("fips", new_role = "county id") %>%
step_dummy(state, county, city, zcta, one_hot = TRUE) %>%
step_corr(all_numeric()) %>%
step_nzv(all_numeric())
Now we will check the preprocessed data again to see if we still have NA values
# Estimate the recipe steps from the training data; retain = TRUE keeps the
# processed training set inside the object so bake(new_data = NULL) can
# return it without recomputing.
prepped_rec <- prep(novel_rec, verbose = TRUE, retain = TRUE)
oper 1 step dummy [training]
oper 2 step corr [training]
Warning in cor(x, use = use, method = method) :
the standard deviation is zero
Warning: The correlation matrix has missing values. 273 columns were excluded from the filter.
oper 3 step nzv [training]
The retained training set is ~ 0.27 Mb in memory.
# Pull the already-processed training data back out of the prepped recipe.
preproc_train <- bake(prepped_rec, new_data = NULL)
glimpse(preproc_train)
Rows: 584
Columns: 38
$ id <fct> 18003.0004, 55041.0007, 6065.1003, 39009.0003, 39061.8001, 24510.0006, 6061.000~
$ value <dbl> 11.699065, 6.956780, 13.289744, 10.742000, 14.454783, 12.204167, 11.192063, 6.9~
$ fips <fct> 18003, 55041, 6065, 39009, 39061, 24510, 6061, 6065, 44003, 37111, 19113, 6031,~
$ lat <dbl> 41.09497, 45.56300, 33.94603, 39.44217, 39.18053, 39.34056, 38.74573, 33.85275,~
$ lon <dbl> -85.10182, -88.80880, -117.40063, -81.90883, -84.49215, -76.58222, -121.26631, ~
$ CMAQ <dbl> 10.383231, 3.411247, 11.404085, 7.971165, 11.791236, 10.940985, 8.814368, 7.719~
$ zcta_area <dbl> 16696709, 370280916, 41957182, 132383592, 5358625, 461424, 28131203, 94913381, ~
$ zcta_pop <dbl> 21306, 4141, 44001, 1115, 6566, 934, 41192, 26179, 6135, 30600, 40149, 26079, 3~
$ imp_a500 <dbl> 28.97837370, 0.00000000, 30.39013841, 0.00000000, 51.26198934, 23.86332180, 31.~
$ imp_a15000 <dbl> 13.0547959, 0.3676404, 23.7457506, 0.3307975, 21.7174564, 24.0009267, 17.909259~
$ county_area <dbl> 1702419942, 2626421270, 18664696661, 1304313482, 1051302780, 209643241, 3644136~
$ county_pop <dbl> 355329, 9304, 2189641, 64757, 802374, 620961, 348432, 2189641, 166158, 44996, 2~
$ log_dist_to_prisec <dbl> 6.621891, 8.415468, 7.419762, 6.344681, 5.778350, 6.290041, 7.018836, 6.838720,~
$ log_pri_length_5000 <dbl> 8.517193, 8.517193, 10.150514, 8.517193, 9.956985, 10.300814, 10.288845, 9.4423~
$ log_pri_length_25000 <dbl> 12.77378, 10.16440, 13.14450, 10.12663, 13.02123, 13.68658, 12.19710, 11.33494,~
$ log_prisec_length_500 <dbl> 6.214608, 6.214608, 6.214608, 6.214608, 7.141268, 6.214608, 6.214608, 6.214608,~
$ log_prisec_length_1000 <dbl> 9.240294, 7.600902, 7.600902, 8.793450, 8.407702, 9.451706, 7.600902, 8.094200,~
$ log_prisec_length_5000 <dbl> 11.485093, 9.425537, 10.155961, 10.562382, 11.698608, 12.086627, 10.288845, 10.~
$ log_prisec_length_10000 <dbl> 12.75582, 11.44833, 11.59563, 11.69093, 12.95940, 13.60081, 11.04273, 11.16916,~
$ log_prisec_length_25000 <dbl> 13.98749, 13.15082, 13.44293, 13.58697, 14.59757, 14.87934, 12.99864, 12.53690,~
$ log_nei_2008_pm10_sum_10000 <dbl> 4.91110140, 3.86982666, 4.03184660, 0.00000000, 5.89930186, 5.67208039, 4.20670~
$ log_nei_2008_pm10_sum_15000 <dbl> 5.399131255, 3.883688842, 5.459257126, 0.000000000, 6.294167687, 6.365952312, 4~
$ log_nei_2008_pm10_sum_25000 <dbl> 5.8160473, 3.8872637, 6.8845369, 3.7656347, 6.7347561, 8.6395685, 5.7761975, 3.~
$ popdens_county <dbl> 208.719947, 3.542463, 117.314577, 49.648341, 763.218756, 2961.989125, 95.614433~
$ popdens_zcta <dbl> 1276.059851, 11.183401, 1048.711994, 8.422494, 1225.314330, 2024.168660, 1464.2~
$ nohs <dbl> 4.3, 5.1, 3.7, 4.8, 2.1, 0.0, 2.5, 7.7, 0.8, 8.6, 1.4, 19.2, 8.9, 6.4, 8.3, 2.2~
$ somehs <dbl> 6.7, 10.4, 5.9, 11.5, 10.5, 0.0, 4.3, 7.5, 4.0, 13.9, 4.8, 25.4, 26.4, 9.5, 11.~
$ hs <dbl> 31.7, 40.3, 17.9, 47.3, 30.0, 0.0, 17.8, 21.7, 33.4, 33.4, 25.3, 26.5, 31.6, 26~
$ somecollege <dbl> 27.2, 24.1, 26.3, 20.0, 27.1, 0.0, 26.1, 26.0, 19.2, 18.1, 22.4, 20.0, 28.4, 26~
$ associate <dbl> 8.2, 7.4, 8.3, 3.1, 8.5, 71.4, 13.2, 7.6, 11.8, 11.0, 10.3, 5.4, 3.6, 4.3, 9.7,~
$ bachelor <dbl> 15.0, 8.6, 20.2, 9.8, 14.2, 0.0, 23.4, 17.2, 20.9, 10.2, 25.2, 2.3, 1.1, 19.1, ~
$ grad <dbl> 6.8, 4.2, 17.7, 3.5, 7.6, 28.6, 12.6, 12.3, 9.9, 4.8, 10.5, 1.1, 0.0, 8.4, 5.9,~
$ pov <dbl> 13.500, 18.900, 6.700, 14.400, 12.500, 3.525, 5.900, 10.300, 5.300, 14.000, 5.8~
$ hs_orless <dbl> 42.7, 55.8, 27.5, 63.6, 42.6, 0.0, 24.6, 36.9, 38.2, 55.9, 31.5, 71.1, 66.9, 42~
$ urc2006 <dbl> 3, 6, 1, 5, 1, 1, 2, 1, 2, 6, 4, 4, 4, 4, 4, 2, 5, 3, 5, 4, 3, 5, 5, 1, 1, 1, 3~
$ aod <dbl> 54.11111, 31.16667, 83.12500, 33.36364, 50.80000, 53.50000, 65.09091, 18.14286,~
$ state_California <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ city_Not.in.a.city <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
# Apply the trained recipe to the held-out test set. The warning below is
# about county factor levels that never appeared in the training data.
baked_test_pm <- recipes::bake(prepped_rec, new_data = test_pm)
Warning: There are new levels in a factor: Colbert, Etowah, Russell, Walker, Cochise, Coconino, Mohave, Arkansas, Crittenden, Garland, Phillips, Pope, Calaveras, Humboldt, Inyo, Merced, Monterey, San Benito, Sonoma, Tulare, Arapahoe, Elbert, Larimer, Mesa, Brevard, Leon, Sarasota, Paulding, Ada, Benewah, Lemhi, Shoshone, DuPage, McHenry, Winnebago, Elkhart, St. Joseph, Tippecanoe, Vanderburgh, Black Hawk, Pottawattamie, Van Buren, Wright, Boyd, Christian, Daviess, Hardin, Kenton, Pike, Calcasieu, St. Bernard, Tangipahoa, Baltimore, Cecil, Bristol, Berrien, Missaukee, Muskegon, Oakland, Dakota, Olmsted, Grenada, Lauderdale, Cascade, Sanders, Silver Bow, Scotts Bluff, Merrimack, Rockingham, Atlantic, Gloucester, Lea, San Juan, Santa Fe, Onondaga, Steuben, Westchester, Martin, Pitt, Swain, Watauga, Billings, Lorain, Muskogee, Sequoyah, Deschutes, Multnomah, Umatilla, Berks, Bucks, Cambria, Centre, Lackawanna, Northampton, York, Edgefield, Florence, Pennington, Bowie, Ector, Cache, Frederick, Brist [... truncated]
# Inspect the preprocessed test set (same 38 columns as the training set).
glimpse(baked_test_pm)
Rows: 292
Columns: 38
$ id <fct> 1033.1002, 1055.001, 1069.0003, 1073.0023, 1073.1005, 1073.1009, 1073.5003, 109~
$ value <dbl> 11.212174, 12.375394, 10.508850, 15.591017, 12.399355, 11.103704, 11.775000, 10~
$ fips <fct> 1033, 1055, 1069, 1073, 1073, 1073, 1073, 1097, 1101, 1113, 1127, 4003, 4005, 4~
$ lat <dbl> 34.75878, 33.99375, 31.22636, 33.55306, 33.33111, 33.45972, 33.80167, 30.76994,~
$ lon <dbl> -87.65056, -85.99107, -85.39077, -86.81500, -87.00361, -87.30556, -86.94250, -8~
$ CMAQ <dbl> 9.402679, 9.241744, 9.121892, 10.235612, 10.235612, 8.161789, 8.816101, 8.65489~
$ zcta_area <dbl> 16716984, 154069359, 162685124, 26929603, 166239542, 385566685, 230081840, 1230~
$ zcta_pop <dbl> 9042, 20045, 30217, 9010, 16140, 3699, 13794, 6106, 12997, 8638, 10424, 1021, 6~
$ imp_a500 <dbl> 19.17301038, 16.49307958, 19.13927336, 41.83304498, 1.69765343, 0.00000000, 0.4~
$ imp_a15000 <dbl> 5.2472094, 5.1612102, 4.7401296, 17.4524844, 4.3006226, 0.1623359, 0.9481135, 7~
$ county_area <dbl> 1534877333, 1385618994, 1501737720, 2878192209, 2878192209, 3423328940, 2878192~
$ county_pop <dbl> 54428, 104430, 101547, 658466, 658466, 194656, 658466, 412992, 229363, 189885, ~
$ log_dist_to_prisec <dbl> 5.760131, 5.261457, 7.112373, 6.600958, 6.526896, 9.789557, 8.734821, 5.779170,~
$ log_pri_length_5000 <dbl> 8.517193, 9.066563, 8.517193, 11.156977, 10.915066, 8.517193, 8.517193, 10.3642~
$ log_pri_length_25000 <dbl> 10.15769, 12.01356, 10.12663, 12.98762, 12.86472, 10.12663, 11.85793, 12.23758,~
$ log_prisec_length_500 <dbl> 8.611945, 8.740680, 6.214608, 6.214608, 6.214608, 6.214608, 6.214608, 7.595229,~
$ log_prisec_length_1000 <dbl> 9.735569, 9.627898, 7.600902, 9.075921, 8.949239, 7.600902, 7.600902, 8.658654,~
$ log_prisec_length_5000 <dbl> 11.770407, 11.728889, 12.298627, 12.281645, 11.039001, 8.517193, 8.517193, 11.7~
$ log_prisec_length_10000 <dbl> 12.840663, 12.768279, 12.994141, 13.278416, 12.128956, 9.210340, 10.420770, 12.~
$ log_prisec_length_25000 <dbl> 13.79973, 13.70026, 13.85550, 14.45221, 13.80017, 11.89377, 13.34258, 13.90072,~
$ log_nei_2008_pm10_sum_10000 <dbl> 6.69187313, 4.43719884, 0.92888890, 8.20975958, 5.77118113, 0.01654963, 0.00000~
$ log_nei_2008_pm10_sum_15000 <dbl> 6.70127741, 4.46267932, 3.67473904, 8.64883455, 6.86889676, 0.01654963, 4.01669~
$ log_nei_2008_pm10_sum_25000 <dbl> 7.148858, 4.678311, 3.744629, 8.858019, 8.021072, 6.891992, 6.961580, 6.640326,~
$ popdens_county <dbl> 35.4608142, 75.3670384, 67.6196640, 228.7776327, 228.7776327, 56.8616114, 228.7~
$ popdens_zcta <dbl> 540.8870404, 130.1037411, 185.7391706, 334.5760426, 97.0888142, 9.5936712, 59.9~
$ nohs <dbl> 7.3, 4.3, 5.8, 7.1, 2.7, 11.1, 9.7, 3.0, 8.8, 8.6, 5.7, 32.7, 1.7, 23.9, 8.6, 3~
$ somehs <dbl> 15.8, 13.3, 11.6, 17.1, 6.6, 11.6, 21.6, 17.1, 19.3, 24.6, 14.3, 11.8, 0.0, 17.~
$ hs <dbl> 30.6, 27.8, 29.8, 37.2, 30.7, 46.0, 39.3, 39.0, 36.0, 27.7, 37.6, 45.0, 22.3, 2~
$ somecollege <dbl> 20.9, 29.2, 21.4, 23.5, 25.7, 17.2, 21.6, 22.4, 20.2, 19.4, 21.3, 7.9, 36.4, 20~
$ associate <dbl> 7.6, 10.1, 7.9, 7.3, 8.0, 4.1, 5.2, 6.6, 4.6, 4.3, 7.3, 2.6, 14.7, 4.3, 4.9, 10~
$ bachelor <dbl> 12.7, 10.0, 13.7, 5.9, 17.6, 7.1, 2.2, 7.8, 7.6, 9.5, 10.3, 0.0, 13.6, 4.2, 2.7~
$ grad <dbl> 5.1, 5.4, 9.8, 2.0, 8.7, 2.9, 0.4, 4.2, 3.5, 5.9, 3.5, 0.0, 11.4, 2.0, 0.6, 5.4~
$ pov <dbl> 19.0, 8.8, 15.6, 25.5, 7.3, 8.1, 13.3, 23.2, 23.0, 37.7, 19.3, 15.6, 64.3, 29.5~
$ hs_orless <dbl> 53.7, 45.4, 47.2, 61.4, 40.0, 68.7, 70.6, 59.1, 64.1, 60.9, 57.6, 89.5, 24.0, 6~
$ urc2006 <dbl> 4, 4, 4, 1, 1, 1, 2, 3, 3, 3, 2, 5, 4, 1, 5, 2, 6, 2, 4, 6, 5, 5, 3, 3, 6, 3, 5~
$ aod <dbl> 36.000000, 43.416667, 33.000000, 39.583333, 38.750000, 40.363636, 42.636364, 42~
$ state_California <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1~
$ city_Not.in.a.city <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
Great, now we no longer have NA values!
For our case, we are going to start our analysis with a linear regression but we will demonstrate how we can try different models.
# Start with a bare linear-regression model specification (no engine yet).
PM_model <- parsnip::linear_reg() # PM was used in the name for particulate matter
PM_model
Linear Regression Model Specification (regression)
OK. So far, all we have defined is that we want to use a linear regression… Let’s tell parsnip more about what we want.
# Attach the "lm" computational engine to the specification.
lm_PM_model <-
PM_model %>%
parsnip::set_engine("lm")
lm_PM_model
Linear Regression Model Specification (regression)
Computational engine: lm
Here, we aim to predict the air pollution. You can do this with the set_mode() function of the parsnip package, by using either set_mode(“classification”) or set_mode(“regression”).
# Same specification, now with the mode made explicit (regression, since the
# outcome `value` is continuous).
lm_PM_model <-
PM_model %>%
parsnip::set_engine("lm") %>%
set_mode("regression")
lm_PM_model
Linear Regression Model Specification (regression)
Computational engine: lm
# Bundle the preprocessing recipe and the lm model into a single workflow.
PM_wflow <-workflows::workflow() %>%
workflows::add_recipe(novel_rec) %>%
workflows::add_model(lm_PM_model)
PM_wflow
== Workflow =========================================================================================================
Preprocessor: Recipe
Model: linear_reg()
-- Preprocessor -----------------------------------------------------------------------------------------------------
3 Recipe Steps
* step_dummy()
* step_corr()
* step_nzv()
-- Model ------------------------------------------------------------------------------------------------------------
Linear Regression Model Specification (regression)
Computational engine: lm
# Fit the whole workflow on the RAW training data -- the recipe steps run
# inside the workflow, so train_pm (not preproc_train) is passed here.
PM_wflow_fit <- parsnip::fit(PM_wflow, data = train_pm)
Warning in cor(x, use = use, method = method) :
the standard deviation is zero
Warning: The correlation matrix has missing values. 273 columns were excluded from the filter.
# Print the trained workflow: recipe steps plus the fitted lm coefficients.
PM_wflow_fit
== Workflow [trained] ===============================================================================================
Preprocessor: Recipe
Model: linear_reg()
-- Preprocessor -----------------------------------------------------------------------------------------------------
3 Recipe Steps
* step_dummy()
* step_corr()
* step_nzv()
-- Model ------------------------------------------------------------------------------------------------------------
Call:
stats::lm(formula = ..y ~ ., data = data)
Coefficients:
(Intercept) lat lon CMAQ
2.936e+02 3.261e-02 1.586e-02 2.463e-01
zcta_area zcta_pop imp_a500 imp_a15000
-3.433e-10 1.013e-05 5.064e-03 -3.066e-03
county_area county_pop log_dist_to_prisec log_pri_length_5000
-2.324e-11 -7.576e-08 6.214e-02 -2.006e-01
log_pri_length_25000 log_prisec_length_500 log_prisec_length_1000 log_prisec_length_5000
-5.411e-02 2.204e-01 1.154e-01 2.374e-01
log_prisec_length_10000 log_prisec_length_25000 log_nei_2008_pm10_sum_10000 log_nei_2008_pm10_sum_15000
-3.436e-02 5.224e-01 1.829e-01 -2.355e-02
log_nei_2008_pm10_sum_25000 popdens_county popdens_zcta nohs
2.403e-02 2.203e-05 -2.132e-06 -2.983e+00
somehs hs somecollege associate
-2.956e+00 -2.962e+00 -2.967e+00 -2.999e+00
bachelor grad pov hs_orless
-2.979e+00 -2.978e+00 1.859e-03 NA
urc2006 aod state_California city_Not.in.a.city
2.577e-01 1.535e-02 3.114e+00 -4.250e-02
# Extract the fitted parsnip model from the workflow and tidy the
# coefficient table into a data frame.
wflowoutput <- PM_wflow_fit %>%
pull_workflow_fit() %>%
broom::tidy()
wflowoutput
# Variable-importance plot for the 10 most important predictors.
PM_wflow_fit %>%
pull_workflow_fit() %>%
vip::vip(num_features = 10)
Machine learning (ML) is an optimization problem that tries to minimize the distance between our predicted outcome \(\hat Y = f(X)\) and actual outcome \(Y\) using our features (or predictor variables) \(X\) as input to a function \(f\) that we want to estimate.
As our goal in this section is to assess overall model performance,
# Grab the underlying parsnip (lm) fit object for closer inspection.
wf_fit <- PM_wflow_fit %>%
pull_workflow_fit()
First, let’s pull out our predicted outcome values \(\hat Y = f(X)\)
# Fitted (predicted) values straight from the underlying lm object.
wf_fitted_values <- wf_fit$fit$fitted.values
head(wf_fitted_values)
1 2 3 4 5 6
12.186782 9.139406 12.646119 10.377628 11.909934 9.520860
# or
# Equivalent route: broom::augment() attaches .fitted, .resid, etc. to the
# preprocessed training data, keeping the observed `value` alongside.
wf_fitted_values <-
broom::augment(wf_fit$fit, data = preproc_train) %>%
select(value, .fitted:.std.resid)
wf_fitted_values #see .fitted variable
Or use the predict() function. Because we use the actual workflow here, we can (and actually need to) use the raw data instead of the preprocessed data.
Now, we can compare the predicted outcome values (or fitted values) \(\hat Y\) to the actual outcome values \(Y\) that we observed:
# Scatter plot of predicted (.fitted) vs. observed (value) outcomes.
library(ggplot2)
wf_fitted_values %>%
ggplot(aes(x = value, y = .fitted)) +
geom_point() +
xlab("actual outcome values") +
ylab("predicted outcome values")
Next, let’s use different distance functions \(d(\cdot)\) to assess how far off our predicted outcome \(\hat Y = f(X)\) and actual outcome \(Y\) values are from each other:
\[d(Y - \hat Y)\] There are entire scholarly fields of research dedicated to identifying different distance metrics \(d(\cdot)\) for machine learning applications. However, we will focus on root mean squared error (rmse):
\[ RMSE = \sqrt{\frac{\sum_{i=1}^{n}(\hat{y}_i - y_i)^2}{n}}\] One way to calculate these metrics within the tidymodels framework is to use the yardstick package using the metrics() function.
We also intend to perform cross validation, so we will now split the training data further using the vfold_cv() function of the rsample package.
# Create 10 cross-validation folds from the training data (seeded so the
# fold assignment is reproducible).
set.seed(1234)
vfold_pm <- rsample::vfold_cv(data = train_pm, v = 10)
vfold_pm
# 10-fold cross-validation
# Show the individual analysis/assessment splits inside the fold object.
pull(vfold_pm, splits)
[[1]]
<Analysis/Assess/Total>
<525/59/584>
[[2]]
<Analysis/Assess/Total>
<525/59/584>
[[3]]
<Analysis/Assess/Total>
<525/59/584>
[[4]]
<Analysis/Assess/Total>
<525/59/584>
[[5]]
<Analysis/Assess/Total>
<526/58/584>
[[6]]
<Analysis/Assess/Total>
<526/58/584>
[[7]]
<Analysis/Assess/Total>
<526/58/584>
[[8]]
<Analysis/Assess/Total>
<526/58/584>
[[9]]
<Analysis/Assess/Total>
<526/58/584>
[[10]]
<Analysis/Assess/Total>
<526/58/584>
We can fit the model to our cross validation folds using the fit_resamples() function of the tune package, by specifying our workflow object and the cross validation fold object we just created. See here for more information.
# Fit the linear-model workflow on each of the 10 CV folds.
set.seed(122)
resample_fit <- tune::fit_resamples(PM_wflow, vfold_pm)
! Fold01: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 330...
! Fold01: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Forest, Niagara, Sp...
! Fold02: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 328...
! Fold02: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Macon, Arlington, C...
! Fold03: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 331...
! Fold03: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Oconee, Cobb, Vigo,...
! Fold04: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 324...
! Fold04: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Allen, McDowell, Ch...
! Fold05: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 328...
! Fold05: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Nevada, There are n...
! Fold06: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 326...
! Fold06: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Yuma, Flathead, Gra...
! Fold07: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 331...
! Fold07: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Hampton City, Rando...
! Fold08: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 328...
! Fold08: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Athens, Cabell, Dou...
! Fold09: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 331...
! Fold09: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Maine, North Dakota...
! Fold10: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 329...
! Fold10: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Stark, St. Lucie, A...
We can now take a look at various performance metrics based on the fit of our cross validation “resamples.”
To do this we will use the collect_metrics() function of the tune package. This will show us the mean of the accuracy estimate of the different cross validation folds.
# Print the resampling results; the notes below flag new factor levels and
# rank-deficient fits encountered within individual folds.
resample_fit
Warning: This tuning result has notes. Example notes on model fitting include:
preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Macon, Arlington, Champaign, Citrus, Klamath, Cumberland, Ventura, Monongalia, Dona Ana, Ottawa, Tuscaloosa, Edgecombe, Peoria, Macomb, Teton, Tooele, St. Lawrence, Lincoln, Gibson, prediction from a rank-deficient fit may be misleading
preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 326 columns were excluded from the filter.
preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Hampton City, Randolph, Hinds, Codington, Dauphin, Bennington, Brookings, Raleigh, Passaic, Duplin, Erie, Page, Bullitt, Stanislaus, Cameron, Lucas, Davis, Santa Clara, Mayes, White, DeSoto, Chautauqua, Terrebonne, Harford, Spokane, La Salle, prediction from a rank-deficient fit may be misleading
# Resampling results
# 10-fold cross-validation
# Recipe for the random forest: same roles as the lm recipe, but instead of
# dummy encoding, convert the character variables to factors (step_novel
# guards `state` against levels unseen in training) and remove county/zcta
# entirely before the correlation and near-zero-variance filters.
RF_rec <- recipe(train_pm) %>%
update_role(everything(), new_role = "predictor")%>%
update_role(value, new_role = "outcome")%>%
update_role(id, new_role = "id variable") %>%
update_role("fips", new_role = "county id") %>%
step_novel("state") %>%
step_string2factor("state", "county", "city") %>%
step_rm("county") %>%
step_rm("zcta") %>%
step_corr(all_numeric())%>%
step_nzv(all_numeric())
# Random-forest spec: 10 candidate variables per split, minimum node size 4.
# No mode is set yet, hence "(unknown)" in the print output below.
PMtree_model <-
parsnip::rand_forest(mtry = 10, min_n = 4)
PMtree_model
Random Forest Model Specification (unknown)
Main Arguments:
mtry = 10
min_n = 4
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
# Finalize the RF specification: randomForest engine, regression mode.
RF_PM_model <-
PMtree_model %>%
set_engine("randomForest") %>%
set_mode("regression")
RF_PM_model
Random Forest Model Specification (regression)
Main Arguments:
mtry = 10
min_n = 4
Computational engine: randomForest
# Bundle the RF recipe and RF model into a workflow.
RF_wflow <- workflows::workflow() %>%
workflows::add_recipe(RF_rec) %>%
workflows::add_model(RF_PM_model)
RF_wflow
== Workflow ============================================================================================
Preprocessor: Recipe
Model: rand_forest()
-- Preprocessor ----------------------------------------------------------------------------------------
6 Recipe Steps
* step_novel()
* step_string2factor()
* step_rm()
* step_rm()
* step_corr()
* step_nzv()
-- Model -----------------------------------------------------------------------------------------------
Random Forest Model Specification (regression)
Main Arguments:
mtry = 10
min_n = 4
Computational engine: randomForest
# Fit the random-forest workflow on the raw training data.
RF_wflow_fit <- parsnip::fit(RF_wflow, data = train_pm)
RF_wflow_fit
== Workflow [trained] ==================================================================================
Preprocessor: Recipe
Model: rand_forest()
-- Preprocessor ----------------------------------------------------------------------------------------
6 Recipe Steps
* step_novel()
* step_string2factor()
* step_rm()
* step_rm()
* step_corr()
* step_nzv()
-- Model -----------------------------------------------------------------------------------------------
Call:
randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~10, x), nodesize = min_rows(~4, x))
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 10
Mean of squared residuals: 2.738
% Var explained: 57.67
# Variable-importance plot for the fitted random forest (top 10 predictors).
RF_wflow_fit %>%
pull_workflow_fit() %>%
vip::vip(num_features = 10)
# Cross-validate the RF workflow on the same 10 folds and summarize.
set.seed(456)
resample_RF_fit <- tune::fit_resamples(RF_wflow, vfold_pm)
collect_metrics(resample_RF_fit)
# Equivalent one-shot definition of the same RF specification (identical to
# building it from PMtree_model above).
RF_PM_model <-
parsnip::rand_forest(mtry = 10, min_n = 4) %>%
set_engine("randomForest") %>%
set_mode("regression")
RF_PM_model
Random Forest Model Specification (regression)
Main Arguments:
mtry = 10
min_n = 4
Computational engine: randomForest
# Mark mtry and min_n as tunable hyperparameters with tune() placeholders.
tune_RF_model <- rand_forest(mtry = tune(), min_n = tune()) %>%
set_engine("randomForest") %>%
set_mode("regression")
tune_RF_model
Random Forest Model Specification (regression)
Main Arguments:
mtry = tune()
min_n = tune()
Computational engine: randomForest
Now we add the tunable model specification to a workflow:
# Workflow pairing the RF recipe with the tunable model specification.
RF_tune_wflow <- workflows::workflow() %>%
workflows::add_recipe(RF_rec) %>%
workflows::add_model(tune_RF_model)
RF_tune_wflow
== Workflow ============================================================================================
Preprocessor: Recipe
Model: rand_forest()
-- Preprocessor ----------------------------------------------------------------------------------------
6 Recipe Steps
* step_novel()
* step_string2factor()
* step_rm()
* step_rm()
* step_corr()
* step_nzv()
-- Model -----------------------------------------------------------------------------------------------
Random Forest Model Specification (regression)
Main Arguments:
mtry = tune()
min_n = tune()
Computational engine: randomForest
Tune in parallel
# Number of logical cores available for parallel tuning. detectCores() lives
# in the base `parallel` package, which none of the library() calls above
# attach, so call it with an explicit namespace.
parallel::detectCores()
[1] 12
We have twelve cores available.
# Tune over a grid of 20 (mtry, min_n) candidates across the 10 CV folds.
# mtry is finalized from the data, as the message below indicates.
tune_RF_results <- tune_grid(object = RF_tune_wflow, resamples = vfold_pm, grid = 20)
i Creating pre-processing data to finalize unknown parameter: mtry
# Print the tuning-results object.
tune_RF_results
# Tuning results
# 10-fold cross-validation
# Summarize metrics per candidate, then pick the single best combination by
# RMSE for use in the finalized workflow.
tune_RF_results%>%
collect_metrics() %>%
head()
show_best(tune_RF_results, metric = "rmse", n =1)
tuned_RF_values<- select_best(tune_RF_results, "rmse")
tuned_RF_values
# Substitute the best (mtry, min_n) values into the workflow, replacing the
# tune() placeholders.
RF_tuned_wflow <-RF_tune_wflow %>%
tune::finalize_workflow(tuned_RF_values)
RF_tuned_wflow
== Workflow ============================================================================================
Preprocessor: Recipe
Model: rand_forest()
-- Preprocessor ----------------------------------------------------------------------------------------
6 Recipe Steps
* step_novel()
* step_string2factor()
* step_rm()
* step_rm()
* step_corr()
* step_nzv()
-- Model -----------------------------------------------------------------------------------------------
Random Forest Model Specification (regression)
Main Arguments:
mtry = 15
min_n = 3
Computational engine: randomForest
# `overallfit` was used below but never created in this transcript: last_fit()
# trains the finalized workflow on the full training portion of pm_split and
# evaluates it once on the held-out test set.
overallfit <- RF_tuned_wflow %>%
  tune::last_fit(pm_split)

# Pull the test-set predictions out of the last_fit result.
test_predictions <- collect_predictions(overallfit)
test_predictions

# Predicted vs. actual PM2.5 on the test set, with a linear trend line.
test_predictions %>%
  ggplot(aes(x = value, y = .pred)) +
  geom_point() +
  xlab("actual outcome values") +
  ylab("predicted outcome values") +
  geom_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'